knitr::opts_knit$set(
root.dir = '.',
keep.tex = TRUE
);
knitr::opts_chunk$set(
fig.width = 9,
fig.height = 9,
fig.align = 'center',
fig.pos = 'h',
fig.show = 'hold',
echo = TRUE,
warning = FALSE,
message = FALSE,
cache = FALSE
);
nutrimouse data from the
mixOmics R package and investigate its structure.library(mixOmics)
A data object provided by an R package can be loaded with
data. Its structure can be obainted with str,
length, dim, etc.
data("nutrimouse")
## display the structure of the nutrimouse object
str(nutrimouse)
## List of 4
## $ gene :'data.frame': 40 obs. of 120 variables:
## ..$ X36b4 : num [1:40] -0.42 -0.44 -0.48 -0.45 -0.42 -0.43 -0.53 -0.49 -0.36 -0.5 ...
## ..$ ACAT1 : num [1:40] -0.65 -0.68 -0.74 -0.69 -0.71 -0.69 -0.62 -0.69 -0.66 -0.62 ...
## ..$ ACAT2 : num [1:40] -0.84 -0.91 -1.1 -0.65 -0.54 -0.8 -1 -0.91 -0.74 -0.79 ...
## ..$ ACBP : num [1:40] -0.34 -0.32 -0.46 -0.41 -0.38 -0.32 -0.44 -0.37 -0.39 -0.36 ...
## ..$ ACC1 : num [1:40] -1.29 -1.23 -1.3 -1.26 -1.21 -1.13 -1.22 -1.29 -1.15 -1.21 ...
## ..$ ACC2 : num [1:40] -1.13 -1.06 -1.09 -1.09 -0.89 -0.79 -1 -1.06 -1.08 -0.82 ...
## ..$ ACOTH : num [1:40] -0.93 -0.99 -1.06 -0.93 -1 -0.93 -0.94 -1.05 -0.88 -0.92 ...
## ..$ ADISP : num [1:40] -0.98 -0.97 -1.08 -1.02 -0.95 -0.97 -0.94 -1.02 -0.98 -0.99 ...
## ..$ ADSS1 : num [1:40] -1.19 -1 -1.18 -1.07 -1.08 -1.07 -1.05 -1.16 -1.05 -1 ...
## ..$ ALDH3 : num [1:40] -0.68 -0.62 -0.75 -0.71 -0.76 -0.75 -0.67 -0.75 -0.66 -0.69 ...
## ..$ AM2R : num [1:40] -0.59 -0.58 -0.66 -0.65 -0.59 -0.55 -0.66 -0.66 -0.53 -0.62 ...
## ..$ AOX : num [1:40] -0.16 -0.12 -0.16 -0.17 -0.31 -0.23 -0.09 -0.22 -0.06 -0.23 ...
## ..$ BACT : num [1:40] -0.22 -0.32 -0.32 -0.32 -0.31 -0.29 -0.25 -0.21 -0.15 -0.2 ...
## ..$ BIEN : num [1:40] -0.89 -0.88 -0.89 -0.77 -0.97 -0.84 -0.86 -0.9 -0.74 -0.76 ...
## ..$ BSEP : num [1:40] -0.69 -0.6 -0.7 -0.67 -0.68 -0.55 -0.67 -0.66 -0.6 -0.58 ...
## ..$ Bcl.3 : num [1:40] -1.18 -1.07 -1.17 -1.12 -0.93 -1.08 -1.03 -1.01 -1.01 -1.1 ...
## ..$ C16SR : num [1:40] 1.66 1.65 1.57 1.61 1.66 1.7 1.58 1.62 1.72 1.55 ...
## ..$ CACP : num [1:40] -0.92 -0.87 -1.02 -0.89 -0.93 -0.97 -0.97 -0.96 -0.85 -0.95 ...
## ..$ CAR1 : num [1:40] -0.97 -0.92 -0.98 -0.97 -1.06 -1.03 -0.91 -1.11 -0.85 -0.99 ...
## ..$ CBS : num [1:40] -0.26 -0.36 -0.4 -0.39 -0.35 -0.31 -0.32 -0.4 -0.26 -0.39 ...
## ..$ CIDEA : num [1:40] -1.21 -1.17 -1.29 -1.18 -1.15 -1.14 -1.16 -1.26 -1.12 -1.08 ...
## ..$ COX1 : num [1:40] -1.11 -1.06 -1.17 -1.03 -0.99 -1.03 -1.15 -1.18 -0.94 -1.07 ...
## ..$ COX2 : num [1:40] -1.18 -1.06 -1.14 -1.13 -1.1 -1.16 -1.06 -1.24 -1.23 -1.09 ...
## ..$ CPT2 : num [1:40] -0.87 -0.87 -0.95 -0.88 -0.91 -0.92 -0.86 -0.93 -0.82 -0.88 ...
## ..$ CYP24 : num [1:40] -1.37 -1.14 -1.3 -1.27 -1.2 -1.11 -1.12 -1.3 -1.14 -1.08 ...
## ..$ CYP26 : num [1:40] -1.21 -1.12 -1.22 -1.18 -1.16 -1.1 -1.07 -1.23 -1.1 -1.1 ...
## ..$ CYP27a1 : num [1:40] -0.71 -0.62 -0.78 -0.71 -0.69 -0.6 -0.69 -0.81 -0.62 -0.62 ...
## ..$ CYP27b1 : num [1:40] -1.31 -1.14 -1.29 -1.27 -1.2 -1.15 -1.17 -1.28 -1.13 -1.15 ...
## ..$ CYP2b10 : num [1:40] -1.23 -1.2 -1.32 -1.23 -1.22 -1.1 -1.07 -1.26 -1.19 -1.1 ...
## ..$ CYP2b13 : num [1:40] -1.19 -1.06 -1.25 -1.13 -1.1 -1.07 -1.2 -1.37 -1.15 -1.11 ...
## ..$ CYP2c29 : num [1:40] -0.06 -0.2 -0.3 -0.07 -0.29 -0.28 -0.1 -0.1 0.18 -0.33 ...
## ..$ CYP3A11 : num [1:40] -0.09 -0.34 -0.45 -0.11 -0.51 -0.55 -0.18 -0.25 0.06 -0.4 ...
## ..$ CYP4A10 : num [1:40] -0.81 -0.88 -0.71 -0.65 -1.16 -0.99 -0.62 -0.82 -0.48 -0.79 ...
## ..$ CYP4A14 : num [1:40] -0.81 -0.84 -0.98 -0.41 -1.16 -1.09 -0.76 -0.87 -0.37 -0.95 ...
## ..$ CYP7a : num [1:40] -0.77 -0.71 -0.93 -0.8 -0.71 -0.74 -0.76 -0.88 -0.77 -0.77 ...
## ..$ CYP8b1 : num [1:40] -0.77 -0.63 -0.53 -0.73 -0.51 -0.55 -0.57 -0.63 -0.6 -0.66 ...
## ..$ FAS : num [1:40] -0.41 -0.37 -0.3 -0.59 -0.06 0.18 -0.16 0.04 -0.53 0.08 ...
## ..$ FAT : num [1:40] -1.03 -0.98 -1.03 -1.06 -0.99 -0.99 -0.89 -1.08 -1.04 -0.91 ...
## ..$ FDFT : num [1:40] -0.98 -0.92 -1.04 -1 -0.99 -1 -1.02 -0.97 -1.03 -0.95 ...
## ..$ FXR : num [1:40] -0.93 -0.87 -1 -0.9 -0.89 -0.89 -0.86 -1.01 -0.81 -0.91 ...
## ..$ G6PDH : num [1:40] -1.22 -1.09 -1.28 -1.19 -1.16 -0.96 -1.15 -1.26 -1.13 -1.03 ...
## ..$ G6Pase : num [1:40] -0.46 -0.63 -1.06 -0.71 -0.58 -0.49 -0.51 -0.61 -0.38 -0.6 ...
## ..$ GK : num [1:40] -0.71 -0.67 -0.68 -0.75 -0.62 -0.59 -0.59 -0.66 -0.68 -0.47 ...
## ..$ GS : num [1:40] -1.24 -1.22 -1.36 -1.21 -1.22 -1.16 -1.15 -1.31 -1.16 -1.19 ...
## ..$ GSTa : num [1:40] 0 -0.05 -0.13 -0.09 -0.02 -0.11 -0.06 -0.04 0.03 -0.02 ...
## ..$ GSTmu : num [1:40] 0.02 -0.05 -0.19 0.03 -0.23 -0.05 -0.22 -0.07 0.23 -0.14 ...
## ..$ GSTpi2 : num [1:40] 0.45 0.3 0.18 0.36 0.3 0.17 0.12 0.48 0.53 0.01 ...
## ..$ HMGCoAred: num [1:40] -0.95 -0.86 -0.96 -1.02 -0.7 -0.76 -1 -0.88 -0.96 -0.7 ...
## ..$ HPNCL : num [1:40] -0.65 -0.69 -0.75 -0.61 -0.66 -0.56 -0.61 -0.71 -0.53 -0.6 ...
## ..$ IL.2 : num [1:40] -0.94 -0.94 -1.16 -0.97 -0.93 -0.96 -0.96 -0.85 -0.84 -0.95 ...
## ..$ L.FABP : num [1:40] 0.24 0.27 0.17 0.16 0 0.23 0.18 0.18 0.2 0.2 ...
## ..$ LCE : num [1:40] 0.09 0.06 -0.05 0.01 -0.07 -0.1 -0.03 -0.08 0.12 -0.1 ...
## ..$ LDLr : num [1:40] -0.82 -0.68 -0.82 -0.94 -0.73 -0.74 -0.8 -0.83 -0.81 -0.72 ...
## ..$ LPK : num [1:40] -0.32 -0.39 -0.38 -0.38 -0.17 -0.14 -0.35 -0.13 -0.32 -0.24 ...
## ..$ LPL : num [1:40] -1.01 -0.97 -1.11 -0.99 -1.05 -0.99 -0.93 -1.07 -0.94 -0.95 ...
## ..$ LXRa : num [1:40] -0.82 -0.82 -0.91 -0.85 -0.83 -0.79 -0.77 -0.84 -0.75 -0.78 ...
## ..$ LXRb : num [1:40] -1 -0.95 -1.16 -1.01 -1.01 -0.99 -0.98 -1.04 -0.98 -0.99 ...
## ..$ Lpin : num [1:40] -0.87 -0.97 -0.95 -1 -0.57 -0.51 -0.81 -0.83 -0.83 -0.48 ...
## ..$ Lpin1 : num [1:40] -0.85 -0.99 -0.94 -1.02 -0.53 -0.51 -0.81 -0.87 -0.82 -0.49 ...
## ..$ Lpin2 : num [1:40] -0.85 -0.87 -0.9 -0.88 -0.72 -0.68 -0.8 -0.9 -0.68 -0.67 ...
## ..$ Lpin3 : num [1:40] -1.23 -1.12 -1.25 -1.18 -1.12 -1.09 -1.04 -1.23 -1.13 -1.11 ...
## ..$ M.CPT1 : num [1:40] -1.15 -1.06 -1.26 -1.1 -1.11 -1.14 -1.08 -1.19 -1.06 -1.09 ...
## ..$ MCAD : num [1:40] -0.6 -0.62 -0.7 -0.59 -0.69 -0.66 -0.53 -0.66 -0.45 -0.62 ...
## ..$ MDR1 : num [1:40] -1.15 -1.1 -1.26 -1.13 -1.11 -1.09 -1.09 -1.19 -1.06 -1.1 ...
## ..$ MDR2 : num [1:40] -0.77 -0.65 -0.86 -0.77 -0.7 -0.69 -0.81 -0.81 -0.69 -0.75 ...
## ..$ MRP6 : num [1:40] -0.99 -0.85 -0.9 -0.95 -0.91 -0.84 -0.88 -1.02 -0.83 -0.86 ...
## ..$ MS : num [1:40] -1.11 -1.06 -1.2 -1.09 -1.09 -1.09 -0.99 -1.16 -1.06 -0.98 ...
## ..$ MTHFR : num [1:40] -0.96 -0.99 -1.1 -0.95 -0.93 -0.96 -0.88 -1.03 -1.01 -0.95 ...
## ..$ NGFiB : num [1:40] -1.21 -1.08 -1.24 -1.12 -1.11 -1.04 -1.02 -1.21 -1.11 -1.04 ...
## ..$ NURR1 : num [1:40] -1.21 -1.1 -1.32 -1.11 -1.14 -1.18 -1.1 -1.26 -1.14 -1.09 ...
## ..$ Ntcp : num [1:40] -0.49 -0.45 -0.44 -0.54 -0.47 -0.46 -0.55 -0.5 -0.44 -0.43 ...
## ..$ OCTN2 : num [1:40] -1.15 -1.15 -1.2 -1.17 -1.19 -1.11 -1.08 -1.21 -1.05 -1.08 ...
## ..$ PAL : num [1:40] -1.32 -1.25 -1.16 -1.25 -1.24 -1.02 -1.04 -1.27 -0.93 -0.92 ...
## ..$ PDK4 : num [1:40] -1.16 -1.16 -1.27 -1.16 -1.13 -1.08 -1.14 -1.24 -1.19 -1.04 ...
## ..$ PECI : num [1:40] -0.68 -0.69 -0.92 -0.71 -0.83 -0.81 -0.79 -0.85 -0.58 -0.82 ...
## ..$ PLTP : num [1:40] -1.1 -0.99 -1.03 -1.08 -0.98 -0.89 -1.05 -1.07 -1.02 -0.85 ...
## ..$ PMDCI : num [1:40] -0.52 -0.52 -0.6 -0.52 -0.71 -0.69 -0.55 -0.57 -0.46 -0.69 ...
## ..$ PON : num [1:40] -0.52 -0.55 -0.65 -0.64 -0.57 -0.63 -0.56 -0.65 -0.6 -0.64 ...
## ..$ PPARa : num [1:40] -0.93 -0.86 -0.95 -0.97 -0.94 -0.95 -0.9 -1.12 -0.88 -0.95 ...
## ..$ PPARd : num [1:40] -1.51 -1.59 -1.71 -1.57 -1.53 -1.56 -1.49 -1.57 -1.58 -1.54 ...
## ..$ PPARg : num [1:40] -1.06 -1.02 -1.14 -1.05 -1.09 -1.01 -1 -1.13 -0.97 -1.07 ...
## ..$ PXR : num [1:40] -0.99 -0.96 -1.1 -0.99 -1 -1.03 -0.93 -1.07 -0.98 -0.96 ...
## ..$ Pex11a : num [1:40] -1 -1.02 -1.2 -1 -0.95 -1.07 -1.05 -1.02 -1 -1.01 ...
## ..$ RARa : num [1:40] -1.2 -1.06 -1.16 -1.17 -1.15 -1.13 -1.09 -1.24 -1.03 -1.09 ...
## ..$ RARb2 : num [1:40] -1.19 -1.11 -1.23 -1.16 -1.14 -1.07 -1.09 -1.18 -1.12 -1.1 ...
## ..$ RXRa : num [1:40] -0.67 -0.59 -0.68 -0.72 -0.78 -0.62 -0.65 -0.76 -0.55 -0.67 ...
## ..$ RXRb2 : num [1:40] -0.95 -0.95 -1.07 -0.95 -0.98 -0.94 -0.92 -1.03 -0.94 -0.95 ...
## ..$ RXRg1 : num [1:40] -1.16 -1.1 -1.21 -1.1 -1.11 -1.03 -1.07 -1.19 -1.05 -1.04 ...
## ..$ S14 : num [1:40] -0.93 -0.86 -0.84 -1.05 -0.65 -0.4 -0.73 -0.62 -0.99 -0.25 ...
## ..$ SHP1 : num [1:40] -1.1 -0.97 -1.09 -1.03 -1.13 -0.98 -0.95 -1.21 -0.93 -0.97 ...
## ..$ SIAT4c : num [1:40] -1.07 -0.97 -1.04 -0.99 -0.94 -0.93 -0.89 -1.04 -0.93 -0.95 ...
## ..$ SPI1.1 : num [1:40] 1.19 1.15 1.09 1.07 1.22 1.05 1.15 1.18 1.21 1.04 ...
## ..$ SR.BI : num [1:40] -0.84 -0.86 -0.95 -0.95 -1.06 -0.8 -0.83 -1 -0.83 -0.77 ...
## ..$ THB : num [1:40] -0.79 -0.85 -0.92 -0.79 -0.84 -0.86 -0.8 -0.86 -0.83 -0.85 ...
## ..$ THIOL : num [1:40] -0.18 -0.15 -0.24 -0.15 -0.35 -0.29 -0.22 -0.23 -0.17 -0.18 ...
## ..$ TRa : num [1:40] -1.48 -1.46 -1.58 -1.54 -1.46 -1.44 -1.32 -1.56 -1.46 -1.35 ...
## ..$ TRb : num [1:40] -1.07 -1 -1.16 -1.11 -1.01 -1 -0.97 -1.08 -1.02 -0.98 ...
## ..$ Tpalpha : num [1:40] -0.69 -0.74 -0.81 -0.74 -0.82 -0.76 -0.72 -0.76 -0.65 -0.83 ...
## ..$ Tpbeta : num [1:40] -1.11 -1.09 -1.14 -1.04 -1.2 -1.05 -1 -1.16 -0.91 -1.07 ...
## .. [list output truncated]
## $ lipid :'data.frame': 40 obs. of 21 variables:
## ..$ C14.0 : num [1:40] 0.34 0.38 0.36 0.22 0.37 1.7 0.35 0.34 0.22 1.38 ...
## ..$ C16.0 : num [1:40] 26.4 24 23.7 25.5 24.8 ...
## ..$ C18.0 : num [1:40] 10.22 9.93 8.96 8.14 9.63 ...
## ..$ C16.1n.9: num [1:40] 0.35 0.55 0.55 0.49 0.46 0.66 0.36 0.29 0.44 0.9 ...
## ..$ C16.1n.7: num [1:40] 3.1 2.54 2.65 2.82 2.85 7.26 3.6 3.27 2.36 7.01 ...
## ..$ C18.1n.9: num [1:40] 17 20.1 22.9 21.9 21.4 ...
## ..$ C18.1n.7: num [1:40] 2.41 3.92 3.96 2.52 2.96 8.99 2.15 1.99 1.81 8.85 ...
## ..$ C20.1n.9: num [1:40] 0.26 0.23 0.26 0 0.3 0.36 0.25 0.31 0 0.21 ...
## ..$ C20.3n.9: num [1:40] 0 0 0.19 0 0.27 2.89 0 0 0 2.03 ...
## ..$ C18.2n.6: num [1:40] 8.93 14.98 16.06 13.89 14.55 ...
## ..$ C18.3n.6: num [1:40] 0 0.3 0.27 0 0.27 2.66 0 0 0 0 ...
## ..$ C20.2n.6: num [1:40] 0 0.3 0.33 0 0.23 0 0 0 0 0 ...
## ..$ C20.3n.6: num [1:40] 0.78 1.64 1.51 1.1 1.58 0.81 0.68 0.72 1.07 0.59 ...
## ..$ C20.4n.6: num [1:40] 3.07 15.34 13.27 3.92 11.85 ...
## ..$ C22.4n.6: num [1:40] 0 0.58 0.54 0 0.32 0 0 0 0 0 ...
## ..$ C22.5n.6: num [1:40] 0 2.1 1.77 0 0.44 0.56 0 0 0 0.39 ...
## ..$ C18.3n.3: num [1:40] 5.97 0 0 0.49 0.42 0 8.4 6.01 0.55 0 ...
## ..$ C20.3n.3: num [1:40] 0.37 0 0 0 0 0 0.42 0.39 0 0 ...
## ..$ C20.5n.3: num [1:40] 8.62 0 0 2.99 0.3 0 7.37 7.96 3.13 0 ...
## ..$ C22.5n.3: num [1:40] 1.75 0.48 0.22 1.04 0.35 2.13 2.05 2.33 1.65 0 ...
## ..$ C22.6n.3: num [1:40] 10.39 2.61 2.51 14.99 6.69 ...
## $ diet : Factor w/ 5 levels "coc","fish","lin",..: 3 5 5 2 4 1 3 3 2 1 ...
## $ genotype: Factor w/ 2 levels "wt","ppar": 1 1 1 1 1 1 1 1 1 1 ...
## check dimensions
lapply(nutrimouse, dim) # apply function dim to each element in list nutrimouse
## $gene
## [1] 40 120
##
## $lipid
## [1] 40 21
##
## $diet
## NULL
##
## $genotype
## NULL
lapply(nutrimouse, length) # apply function length to each element in list nutrimouse
## $gene
## [1] 120
##
## $lipid
## [1] 21
##
## $diet
## [1] 40
##
## $genotype
## [1] 40
samples x
variables matrix format. Investigate variables’
distribution.## get gene expression data structure
class(nutrimouse$gene)
## [1] "data.frame"
dim(nutrimouse$gene)
## [1] 40 120
rownames(nutrimouse$gene)[1:10]
## [1] "1" "2" "3" "4" "5" "6" "7" "8" "9" "10"
colnames(nutrimouse$gene)[1:10]
## [1] "X36b4" "ACAT1" "ACAT2" "ACBP" "ACC1" "ACC2" "ACOTH" "ADISP" "ADSS1"
## [10] "ALDH3"
## check if there are missing values
any(is.na(nutrimouse$gene))
## [1] FALSE
## investigate each variable
summary(nutrimouse$gene[, 1:4])
## X36b4 ACAT1 ACAT2 ACBP
## Min. :-0.5800 Min. :-0.7500 Min. :-1.1000 Min. :-0.6600
## 1st Qu.:-0.5025 1st Qu.:-0.6900 1st Qu.:-0.8800 1st Qu.:-0.5025
## Median :-0.4600 Median :-0.6600 Median :-0.7950 Median :-0.4250
## Mean :-0.4552 Mean :-0.6552 Mean :-0.7668 Mean :-0.4338
## 3rd Qu.:-0.4200 3rd Qu.:-0.6200 3rd Qu.:-0.6450 3rd Qu.:-0.3550
## Max. :-0.3000 Max. :-0.5200 Max. :-0.3900 Max. :-0.2400
colors <- rainbow(20, alpha=1)
plot(density(scale(nutrimouse$gene[, 1], center=T, scale=F)),
col=colors[1], xlim=c(-0.5,0.5), ylim=c(0,8),
main='Density plot of the first 20 genes')
invisible(sapply(2:20, function(i) {
lines(density(scale(nutrimouse$gene[, i], center=T, scale=F)), col=colors[i])
}))
A number of methods in different R packages can perform PCA,
e.g. stats::prcomp, stats::princomp,
mixOmics::pca, multiblock::pca,
psych::principal, FactoMineR::PCA, etc.
pca.res <- prcomp(nutrimouse$gene, center=TRUE, scale.=F)
names(pca.res)
## [1] "sdev" "rotation" "center" "scale" "x"
summary(pca.res)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6 PC7
## Standard deviation 0.6763 0.5064 0.4033 0.28206 0.24164 0.19445 0.17513
## Proportion of Variance 0.3497 0.1961 0.1244 0.06084 0.04465 0.02891 0.02345
## Cumulative Proportion 0.3497 0.5458 0.6702 0.73107 0.77572 0.80463 0.82808
## PC8 PC9 PC10 PC11 PC12 PC13 PC14
## Standard deviation 0.16498 0.14796 0.13623 0.13425 0.11505 0.11208 0.11052
## Proportion of Variance 0.02081 0.01674 0.01419 0.01378 0.01012 0.00961 0.00934
## Cumulative Proportion 0.84889 0.86563 0.87983 0.89361 0.90373 0.91333 0.92267
## PC15 PC16 PC17 PC18 PC19 PC20 PC21
## Standard deviation 0.10450 0.09952 0.09052 0.08962 0.07914 0.07511 0.07313
## Proportion of Variance 0.00835 0.00757 0.00627 0.00614 0.00479 0.00431 0.00409
## Cumulative Proportion 0.93102 0.93860 0.94486 0.95101 0.95579 0.96011 0.96420
## PC22 PC23 PC24 PC25 PC26 PC27 PC28
## Standard deviation 0.06913 0.06708 0.06308 0.06186 0.06029 0.05810 0.05639
## Proportion of Variance 0.00365 0.00344 0.00304 0.00293 0.00278 0.00258 0.00243
## Cumulative Proportion 0.96785 0.97129 0.97434 0.97726 0.98004 0.98262 0.98505
## PC29 PC30 PC31 PC32 PC33 PC34 PC35
## Standard deviation 0.05151 0.04984 0.04840 0.04724 0.04602 0.04083 0.03979
## Proportion of Variance 0.00203 0.00190 0.00179 0.00171 0.00162 0.00127 0.00121
## Cumulative Proportion 0.98708 0.98898 0.99077 0.99248 0.99410 0.99538 0.99659
## PC36 PC37 PC38 PC39 PC40
## Standard deviation 0.03680 0.03468 0.03282 0.02883 1.865e-15
## Proportion of Variance 0.00104 0.00092 0.00082 0.00064 0.000e+00
## Cumulative Proportion 0.99762 0.99854 0.99936 1.00000 1.000e+00
Variances = eigenvalues of the covariance matrix = (standard deviation)^2.
variances <- pca.res$sdev^2
variances
## [1] 4.573742e-01 2.564410e-01 1.626804e-01 7.955690e-02 5.838751e-02
## [6] 3.780991e-02 3.066913e-02 2.721979e-02 2.189256e-02 1.855921e-02
## [11] 1.802243e-02 1.323667e-02 1.256153e-02 1.221509e-02 1.091924e-02
## [16] 9.905054e-03 8.193579e-03 8.032182e-03 6.262595e-03 5.641794e-03
## [21] 5.348430e-03 4.779376e-03 4.500169e-03 3.978795e-03 3.826089e-03
## [26] 3.634453e-03 3.376009e-03 3.179730e-03 2.653212e-03 2.484088e-03
## [31] 2.342963e-03 2.231208e-03 2.117845e-03 1.667351e-03 1.583188e-03
## [36] 1.353925e-03 1.202714e-03 1.076873e-03 8.311648e-04 3.480070e-30
Scree plot: plot of variances.
screeplot(pca.res, npcs=length(variances), type='lines')
screeplot(pca.res, npcs=length(variances), type='barplot')
barplot(variances, xlab='PC', ylab='Variance', names.arg=1:length(variances))
Scree plot on variance percentage.
varPercent <- variances/sum(variances) * 100
barplot(varPercent, xlab='PC', ylab='Percent Variance', names.arg=1:length(varPercent))
Scores: sample coordinates in the new reference (rotated axes or principal components).
scores <- pca.res$x
str(scores)
## num [1:40, 1:40] -0.5036 -0.6119 0.0596 -0.4686 -0.2457 ...
## - attr(*, "dimnames")=List of 2
## ..$ : chr [1:40] "1" "2" "3" "4" ...
## ..$ : chr [1:40] "PC1" "PC2" "PC3" "PC4" ...
Score plot: plot of sample distribution.
PCx <- "PC1"
PCy <- "PC2"
plot(scores[, PCx], scores[, PCy], xlab=PCx, ylab=PCy, pch=16)
text(scores[, PCx], scores[, PCy]-0.05, rownames(scores), col='blue', cex=0.7)
Loadings: contributions of variables to principal components (eigenvectors of covariance matrix).
loadings <- pca.res$rotation
str(loadings)
## num [1:120, 1:40] -0.0425 -0.023 -0.0131 -0.107 -0.0618 ...
## - attr(*, "dimnames")=List of 2
## ..$ : chr [1:120] "X36b4" "ACAT1" "ACAT2" "ACBP" ...
## ..$ : chr [1:40] "PC1" "PC2" "PC3" "PC4" ...
Loading plot: plot of variables’ contribution, revealing their relationship.
plot(loadings[, PCx], loadings[, PCy], type='n', main="Loadings")
arrows(0, 0, loadings[, PCx], loadings[, PCy], xlab=PCx, ylab=PCy,
length=0.1, angle=20, col=rgb(0,0,1,alpha=apply(loadings[, c(PCx, PCy)], 1, norm, "2")))
text(loadings[, PCx], loadings[, PCy], rownames(loadings), col='grey', cex=0.7)
Both score and loading plots can be plotted altogether with the
biplot function.
## biplot
biplot(pca.res, expand=1, cex=c(0.5, 0.7), col=c("gray50", "red"))
library(factoextra)
fviz_pca_biplot(pca.res, repel = TRUE,
col.var = "blue", # Variables color
habillage = nutrimouse$genotype,
addEllipses = T,
legend="none"
)
The samples can be colored with some metadata, e.g genotype or diet,
plot(scores[, PCx], scores[, PCy], main="Scores",
col=c(1:nlevels(nutrimouse$diet))[nutrimouse$diet],
pch=c(17,19)[nutrimouse$genotype],
xlab=paste0(PCx,
" (",
round((summary(pca.res)$importance)[2, PCx], 2), ")"),
ylab=paste0(PCy,
" (",
round((summary(pca.res)$importance)[2, PCy], 2), ")")
)
legend("topright", title="genotype",
legend=levels(nutrimouse$genotype),
pch=c(17,19), cex=0.7)
legend("bottomright", title="diet",
legend=levels(nutrimouse$diet),
col=c(1:5), cex=0.7, pch=16)
or by some gene expression.
nbreaks <- 5
plot(scores[, PCx], scores[, PCy], xlab=PCx, ylab=PCy,
pch=c(17,19)[nutrimouse$genotype],
col=colorRampPalette(c('red','blue'))(nbreaks)[as.numeric(cut(nutrimouse$gene$ALDH3,breaks = nbreaks))])
mixOmics::pls) between gene and lipid.
Investigate its output, sample distribution and variable relationship
with plots.pls.res <- pls(X=nutrimouse$gene, Y=nutrimouse$lipid, ncomp=2, scale=TRUE, mode="canonical")
max(abs(scale(nutrimouse$gene, center=T, scale=T) - pls.res$X))
## [1] 1.332268e-15
max(abs(scale(nutrimouse$lipid, center=T, scale=T) - pls.res$Y))
## [1] 8.881784e-16
The sample distribution plot can be performed with variates, sample coordinates in the new reference (rotated axes) for each of the two blocks,
str(pls.res$variates)
## List of 2
## $ X: num [1:40, 1:2] 6.659 0.614 9.876 4.864 0.934 ...
## ..- attr(*, "dimnames")=List of 2
## .. ..$ : chr [1:40] "1" "2" "3" "4" ...
## .. ..$ : chr [1:2] "comp1" "comp2"
## $ Y: num [1:40, 1:2] 2.33 2.6 1.98 1.94 2.29 ...
## ..- attr(*, "dimnames")=List of 2
## .. ..$ : chr [1:40] "1" "2" "3" "4" ...
## .. ..$ : chr [1:2] "comp1" "comp2"
PCx <- "comp1"
PCy <- "comp2"
par(mfrow=c(1,2))
plot(pls.res$variates$X[, PCx], pls.res$variates$X[, PCy], xlab=PCx, ylab=PCy, main="X", type='n')
text(pls.res$variates$X[, PCx], pls.res$variates$X[, PCy], rownames(pls.res$variates$X), col='blue', cex=0.6)
plot(pls.res$variates$Y[, PCx], pls.res$variates$Y[, PCy], xlab=PCx, ylab=PCy, main="Y", type='n')
text(pls.res$variates$Y[, PCx], pls.res$variates$Y[, PCy], rownames(pls.res$variates$Y), col='blue', cex=0.6)
which is also produced with plotIndiv.
plotIndiv(pls.res)
Loading plot: plot of variables’ contribution in each data block to each variate, after deflating more important variates,
par(mfrow=c(1,2), las=2, mar=c(4,8,1,1))
loadings.ind.X <- order(abs(pls.res$loadings$X[, "comp1"]), decreasing = T)
barplot(head(pls.res$loadings$X[loadings.ind.X, "comp1"], 10), main="X", horiz = T, cex.names=0.8)
loadings.ind.Y <- order(abs(pls.res$loadings$Y[, "comp1"]), decreasing = T)
barplot(head(pls.res$loadings$Y[loadings.ind.Y, "comp1"], 10), main="Y", horiz = T, cex.names=0.8)
which is the same as with plotLoadings.
plotLoadings(pls.res, ndisplay = 10)
The plot of variable relationship could be obtained from loadings.star.
names(pls.res$loadings.star) <- c("X", "Y")
colnames(pls.res$loadings.star$X) <- colnames(pls.res$loadings.star$Y) <- c(PCx, PCy)
plot(1,1,type='n',
xlim=range(c(pls.res$loadings.star$X[, PCx],pls.res$loadings.star$Y[, PCx])),
ylim=range(c(pls.res$loadings.star$X[, PCy],pls.res$loadings.star$Y[, PCy])))
arrows(0, 0, pls.res$loadings.star$X[, PCx], pls.res$loadings.star$X[, PCy],
length=0.1, angle=20, col=rgb(0,0,1,alpha=apply(pls.res$loadings.star$X[, c(PCx, PCy)], 1, norm, "2")))
text(pls.res$loadings.star$X[, PCx],
pls.res$loadings.star$X[, PCy],
rownames(pls.res$loadings.star$X), col='grey', cex=0.7)
arrows(0, 0, pls.res$loadings.star$Y[, PCx], pls.res$loadings.star$Y[, PCy],
length=0.1, angle=20, col=rgb(1,0,0,alpha=apply(pls.res$loadings.star$Y[, c(PCx, PCy)], 1, norm, "2")))
text(pls.res$loadings.star$Y[, PCx],
pls.res$loadings.star$Y[, PCy],
rownames(pls.res$loadings.star$Y), col='grey', cex=0.7)
plotVar(pls.res)
Both sample distribution and variable relationship plot could be done
with biplot function.
biplot(pls.res, block="X", ind.names.size=3, var.names.size=2)
biplot(pls.res, block="Y", ind.names.size=3, var.names.size=2)
pls.reg.res <- pls(X=nutrimouse$gene, Y=nutrimouse$lipid, ncomp=2, scale=TRUE, mode="regression")
pls.can.res <- pls(X=nutrimouse$gene, Y=nutrimouse$lipid, ncomp=2, scale=TRUE, mode="canonical")
par(mfrow=c(2,2))
biplot(pls.reg.res, block="X", ind.names.size=3, var.names.size=2)
biplot(pls.can.res, block="X", ind.names.size=3, var.names.size=2)
biplot(pls.reg.res, block="Y", ind.names.size=3, var.names.size=2)
biplot(pls.can.res, block="Y", ind.names.size=3, var.names.size=2)
mixOmics::plsda) between gene and
genotype. Redo PLS-DA using mixOmics::pls and compare the
results.pls.da.res <- plsda(X=nutrimouse$gene, Y=nutrimouse$genotype, ncomp=2, scale=TRUE)
pls.regda.res <- pls(X=nutrimouse$gene, Y=c(0,1)[nutrimouse$genotype], ncomp=2, scale=TRUE, mode="regression")
par(mfrow=c(1,2))
biplot(pls.da.res, block="X", ind.names.size=3, var.names.size=2,
group=nutrimouse$genotype, col.per.group = c("red", "blue"), legend.title = 'genotype')
biplot(pls.regda.res, block="X", ind.names.size=3, var.names.size=2,
group=nutrimouse$genotype, col.per.group = c("red", "blue"), legend.title = 'genotype')
ropls::opls) between gene and
genotype. Investigate its output, sample distribution, variable
relationship and predictive performance.(https://bioconductor.org/packages/release/bioc/vignettes/ropls/inst/doc/ropls-vignette.html)
library(ropls)
opls.res <- opls(x=nutrimouse$gene, y=nutrimouse$genotype, predI=1, orthoI=NA, fig.pdfC='none')
## OPLS-DA
## 40 samples x 120 variables and 1 response
## standard scaling of predictors and response(s)
## R2X(cum) R2Y(cum) Q2(cum) RMSEE pre ort pR2Y pQ2
## Total 0.615 0.975 0.935 0.083 1 2 0.05 0.05
par(mfrow=c(2,2))
plot(opls.res, typeVc='x-score')
plot(opls.res, typeVc='x-loading')
plot(opls.res, typeVc='overview')
plot(opls.res, typeVc='correlation')
opls.res <- opls(x=nutrimouse$gene, y=nutrimouse$genotype, predI=1, orthoI=NA, permI=100, fig.pdfC='none')
## OPLS-DA
## 40 samples x 120 variables and 1 response
## standard scaling of predictors and response(s)
## R2X(cum) R2Y(cum) Q2(cum) RMSEE pre ort pR2Y pQ2
## Total 0.615 0.975 0.935 0.083 1 2 0.01 0.01
plot(opls.res, typeVc='permutation')
opls.res <- opls(x=nutrimouse$gene, y=nutrimouse$genotype, predI=1, orthoI=NA, subset=c(1:13, 21:33), fig.pdfC='none')
## OPLS-DA
## 26 samples x 120 variables and 1 response
## standard scaling of predictors and response(s)
## R2X(cum) R2Y(cum) Q2(cum) RMSEE RMSEP pre ort
## Total 0.646 0.983 0.915 0.0704 0.134 1 2
par(mfrow=c(1,2))
plot(opls.res, typeVc='predict-train')
plot(opls.res, typeVc='predict-test')
# confusion matrix on training set
train_id <- getSubsetVi(opls.res)
table(nutrimouse$genotype[train_id], fitted(opls.res))
##
## wt ppar
## wt 13 0
## ppar 0 13
# confusion matrix on test set
table(nutrimouse$genotype[-train_id], predict(opls.res, nutrimouse$gene[-train_id,]))
##
## wt ppar
## wt 7 0
## ppar 0 7
mixOmics::rcc) between 20 genes and all
lipids. Investigate correlations, sample distribution and variable
relationship with plots.The gene expression data is reduced to 20 genes so that the number of variables is less than the number of samples, to perform an unregularized CCA.
nutrimouse$gene_selected <- nutrimouse$gene[, 1:20]
str(nutrimouse$gene_selected)
## 'data.frame': 40 obs. of 20 variables:
## $ X36b4: num -0.42 -0.44 -0.48 -0.45 -0.42 -0.43 -0.53 -0.49 -0.36 -0.5 ...
## $ ACAT1: num -0.65 -0.68 -0.74 -0.69 -0.71 -0.69 -0.62 -0.69 -0.66 -0.62 ...
## $ ACAT2: num -0.84 -0.91 -1.1 -0.65 -0.54 -0.8 -1 -0.91 -0.74 -0.79 ...
## $ ACBP : num -0.34 -0.32 -0.46 -0.41 -0.38 -0.32 -0.44 -0.37 -0.39 -0.36 ...
## $ ACC1 : num -1.29 -1.23 -1.3 -1.26 -1.21 -1.13 -1.22 -1.29 -1.15 -1.21 ...
## $ ACC2 : num -1.13 -1.06 -1.09 -1.09 -0.89 -0.79 -1 -1.06 -1.08 -0.82 ...
## $ ACOTH: num -0.93 -0.99 -1.06 -0.93 -1 -0.93 -0.94 -1.05 -0.88 -0.92 ...
## $ ADISP: num -0.98 -0.97 -1.08 -1.02 -0.95 -0.97 -0.94 -1.02 -0.98 -0.99 ...
## $ ADSS1: num -1.19 -1 -1.18 -1.07 -1.08 -1.07 -1.05 -1.16 -1.05 -1 ...
## $ ALDH3: num -0.68 -0.62 -0.75 -0.71 -0.76 -0.75 -0.67 -0.75 -0.66 -0.69 ...
## $ AM2R : num -0.59 -0.58 -0.66 -0.65 -0.59 -0.55 -0.66 -0.66 -0.53 -0.62 ...
## $ AOX : num -0.16 -0.12 -0.16 -0.17 -0.31 -0.23 -0.09 -0.22 -0.06 -0.23 ...
## $ BACT : num -0.22 -0.32 -0.32 -0.32 -0.31 -0.29 -0.25 -0.21 -0.15 -0.2 ...
## $ BIEN : num -0.89 -0.88 -0.89 -0.77 -0.97 -0.84 -0.86 -0.9 -0.74 -0.76 ...
## $ BSEP : num -0.69 -0.6 -0.7 -0.67 -0.68 -0.55 -0.67 -0.66 -0.6 -0.58 ...
## $ Bcl.3: num -1.18 -1.07 -1.17 -1.12 -0.93 -1.08 -1.03 -1.01 -1.01 -1.1 ...
## $ C16SR: num 1.66 1.65 1.57 1.61 1.66 1.7 1.58 1.62 1.72 1.55 ...
## $ CACP : num -0.92 -0.87 -1.02 -0.89 -0.93 -0.97 -0.97 -0.96 -0.85 -0.95 ...
## $ CAR1 : num -0.97 -0.92 -0.98 -0.97 -1.06 -1.03 -0.91 -1.11 -0.85 -0.99 ...
## $ CBS : num -0.26 -0.36 -0.4 -0.39 -0.35 -0.31 -0.32 -0.4 -0.26 -0.39 ...
cca.res <- rcc(X=nutrimouse$gene_selected, Y=nutrimouse$lipid, ncomp=2)
max(abs(nutrimouse$gene_selected - cca.res$X))
## [1] 0
max(abs(nutrimouse$lipid - cca.res$Y))
## [1] 0
str(cca.res)
## List of 11
## $ call : language rcc(X = nutrimouse$gene_selected, Y = nutrimouse$lipid, ncomp = 2)
## $ X : num [1:40, 1:20] -0.42 -0.44 -0.48 -0.45 -0.42 -0.43 -0.53 -0.49 -0.36 -0.5 ...
## ..- attr(*, "dimnames")=List of 2
## .. ..$ : chr [1:40] "1" "2" "3" "4" ...
## .. ..$ : chr [1:20] "X36b4" "ACAT1" "ACAT2" "ACBP" ...
## $ Y : num [1:40, 1:21] 0.34 0.38 0.36 0.22 0.37 1.7 0.35 0.34 0.22 1.38 ...
## ..- attr(*, "dimnames")=List of 2
## .. ..$ : chr [1:40] "1" "2" "3" "4" ...
## .. ..$ : chr [1:21] "C14.0" "C16.0" "C18.0" "C16.1n.9" ...
## $ ncomp : num 2
## $ method : chr "ridge"
## $ cor : Named num [1:20] 1 1 0.999 0.996 0.981 ...
## ..- attr(*, "names")= chr [1:20] "1" "2" "3" "4" ...
## $ loadings :List of 2
## ..$ X: num [1:20, 1:2] 1.408 4.802 3.234 -7.373 -0.724 ...
## .. ..- attr(*, "dimnames")=List of 2
## .. .. ..$ : chr [1:20] "X36b4" "ACAT1" "ACAT2" "ACBP" ...
## .. .. ..$ : NULL
## ..$ Y: num [1:21, 1:2] 1.1112 -0.1436 -0.4625 -1.0203 -0.0901 ...
## .. ..- attr(*, "dimnames")=List of 2
## .. .. ..$ : chr [1:21] "C14.0" "C16.0" "C18.0" "C16.1n.9" ...
## .. .. ..$ : NULL
## $ variates :List of 2
## ..$ X: num [1:40, 1:2] -1.203 -1.25 -0.831 0.338 -0.119 ...
## .. ..- attr(*, "dimnames")=List of 2
## .. .. ..$ : chr [1:40] "1" "2" "3" "4" ...
## .. .. ..$ : NULL
## ..$ Y: num [1:40, 1:2] -1.203 -1.25 -0.831 0.338 -0.119 ...
## .. ..- attr(*, "dimnames")=List of 2
## .. .. ..$ : chr [1:40] "1" "2" "3" "4" ...
## .. .. ..$ : NULL
## $ names :List of 4
## ..$ sample : chr [1:40] "1" "2" "3" "4" ...
## ..$ colnames:List of 2
## .. ..$ X: chr [1:20] "X36b4" "ACAT1" "ACAT2" "ACBP" ...
## .. ..$ Y: chr [1:21] "C14.0" "C16.0" "C18.0" "C16.1n.9" ...
## ..$ blocks : chr [1:2] "X" "Y"
## ..$ data : chr [1:2] "nutrimouse$gene_selected" "nutrimouse$lipid"
## $ lambda : Named num [1:2] 0 0
## ..- attr(*, "names")= chr [1:2] "lambda1" "lambda2"
## $ prop_expl_var:List of 2
## ..$ X: Named num [1:2] 0.00132 0.0024
## .. ..- attr(*, "names")= chr [1:2] "comp1" "comp2"
## ..$ Y: Named num [1:2] 0.0184 0.0299
## .. ..- attr(*, "names")= chr [1:2] "comp1" "comp2"
## - attr(*, "class")= chr "rcc"
cca.res$cor
## 1 2 3 4 5 6 7
## 1.00000000 1.00000000 0.99922446 0.99607902 0.98142435 0.95641141 0.89083472
## 8 9 10 11 12 13 14
## 0.88959894 0.78648273 0.76470925 0.75189350 0.66984945 0.63240310 0.53662009
## 15 16 17 18 19 20
## 0.49948385 0.34852831 0.33274136 0.27818295 0.22569639 0.03783839
The sample distribution plot can be performed with variates, sample coordinates in the new reference (rotated axes) for each of the two blocks.
str(cca.res$variates)
## List of 2
## $ X: num [1:40, 1:2] -1.203 -1.25 -0.831 0.338 -0.119 ...
## ..- attr(*, "dimnames")=List of 2
## .. ..$ : chr [1:40] "1" "2" "3" "4" ...
## .. ..$ : NULL
## $ Y: num [1:40, 1:2] -1.203 -1.25 -0.831 0.338 -0.119 ...
## ..- attr(*, "dimnames")=List of 2
## .. ..$ : chr [1:40] "1" "2" "3" "4" ...
## .. ..$ : NULL
PCx <- 1
PCy <- 2
par(mfrow=c(1,2), las=1, mar=c(4,3,1,1))
plot(cca.res$variates$X[, PCx], cca.res$variates$X[, PCy], xlab=PCx, ylab=PCy, main="X", type='n')
text(cca.res$variates$X[, PCx], cca.res$variates$X[, PCy], rownames(cca.res$variates$X), col='blue', cex=0.6)
plot(cca.res$variates$Y[, PCx], cca.res$variates$Y[, PCy], xlab=PCx, ylab=PCy, main="Y", type='n')
text(cca.res$variates$Y[, PCx], cca.res$variates$Y[, PCy], rownames(cca.res$variates$Y), col='blue', cex=0.6)
cor(cca.res$variates$X[,1], cca.res$variates$Y[,1])
## [1] 1
cor(cca.res$variates$X[,2], cca.res$variates$Y[,2])
## [1] 1
plotIndiv(cca.res)
plotArrow(cca.res)
Variable relationship is obtained from loadings or
with plotVar.
par(mfrow=c(1,2), las=2, mar=c(4,8,1,1))
loadings.ind.X <- order(abs(cca.res$loadings$X[, 1]), decreasing = T)
barplot(head(cca.res$loadings$X[loadings.ind.X, 1], 10), main="X", horiz = T, cex.names=0.8)
loadings.ind.Y <- order(abs(cca.res$loadings$Y[, 1]), decreasing = T)
barplot(head(cca.res$loadings$Y[loadings.ind.Y, 1], 10), main="Y", horiz = T, cex.names=0.8)
max(abs(cca.res$variates$X - scale(cca.res$X, center=T, scale=F) %*% cca.res$loadings$X))
max(abs(cca.res$variates$Y - scale(cca.res$Y, center=T, scale=F) %*% cca.res$loadings$Y))
plotVar(cca.res)
cca.res.scale <- rcc(X=scale(nutrimouse$gene_selected, center=T, scale=T),
Y=scale(nutrimouse$lipid, center=T, scale=T), ncomp=2)
max(abs(cca.res.scale$cor - cca.res$cor))
## [1] 1.149648e-10
max(abs(cca.res.scale$variates$X - cca.res$variates$X))
## [1] 2.64486
max(abs(cca.res.scale$variates$Y - cca.res$variates$Y))
## [1] 2.64486
max(abs(cca.res.scale$loadings$X - cca.res$loadings$X))
## [1] 7.845927
max(abs(cca.res.scale$loadings$Y - cca.res$loadings$Y))
## [1] 123.5061
rcca.res <- rcc(X=nutrimouse$gene, Y=nutrimouse$lipid, ncomp=2, method="shrinkage")
plotVar(rcca.res)